Seven basic functions:
# House scraping: get_df_suburb
# Parse one "sold" listing page into a data frame of priced sales.
#
# `webpage` is the parsed HTML document of a single result page. Returns a data
# frame with columns address, bedroom, bathroom, carspace, soldprice, yearsold.
# NOTE(review): relies on the site's <li>/<h4> layout (each "Sold on ..." entry
# directly followed by its "$" price entry) - confirm against the live markup.
parse_sold_page <- function(webpage) {
  items <- webpage %>% html_nodes("li") %>% html_text()
  # The relevant content ends at the first entry mentioning "current".
  items <- items[seq_len(grep("current", items)[1])]
  # Remove the redundant "listed price" entries and the prices listed with rent.
  items <- items[!grepl("List", items)]
  items <- items[!grepl("Rent", items)]

  price_idx <- grep("\\$", items)   # entries with price info
  sold_idx <- grep("Sold on", items) # entries with sold info

  # Text before the "$" holds bedroom/bathroom/carspace counts, text after it
  # holds the price.
  parts <- strsplit(items[price_idx], "\\$")
  rooms <- strsplit(trimws(vapply(parts, `[`, character(1), 1)), "\\s+")
  price <- as.numeric(gsub(",", "", trimws(vapply(parts, `[`, character(1), 2))))
  room_count <- function(i) as.numeric(vapply(rooms, `[`, character(1), i))

  # The sold date sits in the entry just before each price. Sometimes the price
  # is not listed, so only the dates that have a price are kept.
  sold_text <- trimws(gsub("Sold on", "", items[price_idx - 1]))
  sold_date <- lapply(sold_text, function(x) {
    n_parts <- length(strsplit(x, "\\s")[[1]])
    if (n_parts == 3) {
      dmy(x)                           # day month year
    } else if (n_parts == 2) {
      my(x)                            # month year
    } else {
      as.Date(paste0(x, "-01-01"))     # year only
    }
  })
  sold_date <- do.call("c", sold_date)

  # Addresses are the <h4> headings that come before "Auction History".
  address <- webpage %>% html_nodes("h4") %>% html_text()
  address <- address[seq_len(grep("Auction History", address)[1] - 1)]
  # For every sold entry the immediate next entry should be its price; if it is
  # not, that sale has no price record and its address is dropped.
  address <- address[sold_idx %in% (price_idx - 1)]

  data.frame(
    address = address,
    bedroom = room_count(1),
    bathroom = room_count(2),
    carspace = room_count(3),
    soldprice = price,
    yearsold = sold_date
  )
}

#' Scrape sold-townhouse listings for one suburb from auhouseprices.com.
#'
#' Adapted from https://embracingtherandom.com/r/web-scraping/rent-scraping/
#'
#' @param location Suburb path fragment such as "2151/Parramatta/". Whitespace
#'   is replaced by "+" before it is inserted into the URL.
#' @return A data frame with columns House_ID, address, bedroom, bathroom,
#'   carspace, soldprice and yearsold (one row per sale with a listed price).
#'   If anything fails, `location` is appended to
#'   main1_export_Brandon_log/main1_export_Brandon_log.txt, a warning carrying
#'   the underlying error message is raised, and NULL is returned.
get_df_suburb <- function(location = "2151/Parramatta/") {
  location <- gsub("\\s+", "+", location)

  # type set to townhouse, no other filtering
  page_url <- function(page) {
    paste0(
      "https://www.auhouseprices.com/sold/list/NSW/",
      location,
      page,
      "/?type=townhouse&ymin=0&ymax=0&bmin=0&bmax=0&pmin=0&pmax=0&sort=date&kw="
    )
  }

  tryCatch({
    # Determine how many pages to scroll through: the first <h2> carries the
    # number of properties and the number displayed on each page.
    first_page <- read_html(page_url(1))
    heading <- (first_page %>% html_nodes("h2") %>% html_text())[1]
    counts <- as.numeric(regmatches(heading, gregexpr("[0-9]+", heading))[[1]])
    # number of total properties / number on page = total number of pages
    end_page <- ceiling(counts[3] / counts[2])

    # seq_len() (unlike 1:end_page) yields no iterations when end_page is 0.
    pages <- lapply(seq_len(end_page), function(this_page) {
      # Page 1 was already downloaded for the page count; reuse it.
      webpage <- if (this_page == 1) first_page else read_html(page_url(this_page))
      parse_sold_page(webpage)
    })

    # Bind once instead of growing the data frame inside the loop.
    df <- do.call(rbind, pages)
    if (is.null(df) || nrow(df) == 0) {
      stop("no priced sales found")
    }

    # Add a running House_ID and make it the first column.
    df$House_ID <- seq_len(nrow(df))
    df[, c("House_ID", setdiff(names(df), "House_ID"))]
  }, error = function(e) {
    # Record the failed location so it can be retried later.
    log_dir <- "main1_export_Brandon_log"
    if (!dir.exists(log_dir)) {
      dir.create(log_dir)
    }
    write(location, file.path(log_dir, "main1_export_Brandon_log.txt"), append = TRUE)
    # Surface the cause instead of swallowing it silently.
    warning("get_df_suburb failed for ", location, ": ", conditionMessage(e),
            call. = FALSE)
    NULL
  })
}
# Haversine distance between the point (lat, lon) and the fixed point
# (fixed_lat, fixed_lon), as computed by distHaversine().
add_distance_between <- function(lat, lon, fixed_lat, fixed_lon) {
  # distHaversine() is given each point as c(longitude, latitude).
  from <- c(lon, lat)
  to <- c(fixed_lon, fixed_lat)
  distHaversine(from, to)
}
#' Geocode every address and add its distance to a fixed point.
#'
#' @param df_suburb Data frame with an `address` column.
#' @param suburb_lat,suburb_lon Latitude and longitude of the reference point
#'   (the distance column is named for a train station).
#' @param location Unused; kept so existing callers keep working.
#' @return `df_suburb` plus `latitude`, `longitude` (from the ArcGIS geocoder)
#'   and `distance_to_train_station` (haversine distance from distHaversine()).
get_l_suburb_dist <- function(df_suburb, suburb_lat, suburb_lon, location) {
  l_suburb <- geocode(df_suburb, address, method = 'arcgis',
                      lat = latitude, long = longitude)

  # distHaversine() is vectorised over a two-column (longitude, latitude)
  # matrix, so all rows are measured in one call instead of a row-wise apply().
  points <- cbind(l_suburb$longitude, l_suburb$latitude)
  distance <- distHaversine(points, c(suburb_lon, suburb_lat))

  data.frame(l_suburb, distance_to_train_station = distance)
}
#' Write one suburb's data frame to csv_cache/ as a CSV file.
#'
#' The file name is "l_" + `location` with every "/" replaced by "_" +
#' "houseprice.csv", e.g. "2151/Parramatta/" -> "l_2151_Parramatta_houseprice.csv".
#' The path is relative to the current working directory.
#'
#' @param location Suburb path fragment such as "2151/Parramatta/".
#' @param l_suburb_dist Data frame to export.
#' @return The string "Result: csv export finished".
export_l_suburb_dist_csv <- function(location, l_suburb_dist) {
  cache_dir <- "csv_cache"
  # Create the cache directory on demand so the function also works when it is
  # called on its own.
  if (!dir.exists(cache_dir)) {
    dir.create(cache_dir, recursive = TRUE)
  }
  file_name <- paste0("l_", gsub("/", "_", location), "houseprice.csv")
  write.csv(l_suburb_dist, file.path(cache_dir, file_name), row.names = FALSE)
  "Result: csv export finished"
}
# Scrape, geocode and export a single suburb.
# Returns NULL when get_df_suburb() returned NULL (meaning the scrape failed);
# otherwise returns whatever export_l_suburb_dist_csv() returns.
export_a_suburb <- function(location, suburb_lat, suburb_lon) {
  scraped <- get_df_suburb(location)
  if (is.null(scraped)) {
    NULL
  } else {
    with_distance <- get_l_suburb_dist(scraped, suburb_lat, suburb_lon, location)
    export_l_suburb_dist_csv(location, with_distance)
  }
}
# Delete the failed-location log file, if there is one.
# Returns the result of file.remove() when the file existed, otherwise an
# invisible NULL.
clear_log <- function() {
  log_file <- file.path("main1_export_Brandon_log", "main1_export_Brandon_log.txt")
  # Nothing to remove: leave quietly.
  if (!file.exists(log_file)) {
    return(invisible(NULL))
  }
  file.remove(log_file)
}
# Modified export_all_suburbs function with progress bar
#' Export every suburb listed in an input file into csv_cache/.
#'
#' @param file_name Path to a comma-separated file without a header whose
#'   columns are location, latitude, longitude. Rows whose location starts
#'   with "#" are skipped.
#' @return NULL. Side effects: clears the failure log, creates csv_cache/ in the
#'   working directory if needed, calls export_a_suburb() for every suburb in
#'   random order and prints a text progress bar after each one.
export_all_suburbs <- function(file_name) {
  cat("Exporting into csv_cache/ begins:\n")
  # Clear the log
  clear_log()
  # Create the directory if it doesn't exist. The check must look at the same
  # relative path that is created and written to (not "~/csv_cache").
  if (!dir.exists("csv_cache")) {
    dir.create("csv_cache")
  }
  # Read the input file; comment.char = "" keeps "#" rows so they can be
  # filtered on the location column below.
  suburbs_input <- read.table(
    file_name,
    header = FALSE,
    sep = ",",
    col.names = c("location", "latitude", "longitude"),
    strip.white = TRUE,
    comment.char = "",
    quote = ""
  )
  # Filter out rows starting with a '#' character
  suburbs_input <- suburbs_input[!grepl("^#", suburbs_input$location), ]
  # Randomize the order of rows
  suburbs_input <- suburbs_input[sample(nrow(suburbs_input)), ]
  n_suburbs <- nrow(suburbs_input)
  # seq_len() makes the loop a no-op when no suburbs remain (1:0 would not).
  for (i in seq_len(n_suburbs)) {
    location <- as.character(suburbs_input[i, "location"])
    latitude <- as.numeric(suburbs_input[i, "latitude"])
    longitude <- as.numeric(suburbs_input[i, "longitude"])
    export_a_suburb(location, latitude, longitude)
    # Print progress bar
    progress <- i / n_suburbs
    num_hashes <- floor(progress * 100 / 2) # each '#' represents 2% of the progress
    num_spaces <- 50 - num_hashes           # the bar body is 50 characters wide
    cat("\n")
    cat(sprintf(
      "#%s%s (%.0f%%)\n",
      paste(rep("#", num_hashes), collapse = ""),
      paste(rep(" ", num_spaces), collapse = ""),
      progress * 100
    ))
  }
  NULL
}
Reading in files
# Get the list of CSV files in the 'csv_cache' directory.
# list.files() expects a regular expression, so match the ".csv" extension
# explicitly rather than using the glob "*.csv".
csv_files <- list.files(path = "csv_cache", pattern = "\\.csv$", full.names = TRUE)

# Distance bands of 0.25 km from 0 to 4 km. Every boundary (including 2.75) must
# be present, otherwise the "(2.5,2.75]" and "(2.75,3]" classes never exist.
distance_breaks <- seq(0, 4, by = 0.25)

# Read one suburb CSV and add the distance in km plus its distance band.
read_suburb_csv <- function(file) {
  location_data <- read.csv(file)
  # Convert the stored distance to kilometres
  location_data[["distance_to_train_station(km)"]] <-
    location_data$distance_to_train_station / 1000
  # Classing distance
  location_data$distance_class <- cut(
    location_data[["distance_to_train_station(km)"]],
    breaks = distance_breaks
  )
  location_data
}

# Read every file, then bind once instead of growing the data frame in a loop.
combined_df <- do.call(rbind, lapply(csv_files, read_suburb_csv))
if (is.null(combined_df)) {
  # No CSV files found: keep the empty data frame the loop version started from.
  combined_df <- data.frame()
}

# Inspect the combined data frame
head(combined_df)
## House_ID address bedroom bathroom carspace
## 1 1 10/92 Buckland Street Alexandria 2015 3 2 2
## 2 2 6/92 Buckland Street Alexandria 2015 3 2 2
## 3 3 14/18-20 Newton Street Alexandria 2015 2 1 1
## 4 4 PG09/11 Power Avenue Alexandria 2015 2 2 1
## 5 5 28A Gerard Street Alexandria 2015 3 3 NA
## 6 6 1/92 Buckland Street Alexandria 2015 3 2 2
## soldprice yearsold latitude longitude distance_to_train_station
## 1 2350000 2023-03-15 -33.90043 151.1947 967.6896
## 2 2350000 2023-01-30 -33.90043 151.1947 967.6896
## 3 1140000 2022-07-30 -33.89918 151.1909 1325.9023
## 4 1550000 2022-07-06 -33.90129 151.1983 669.3040
## 5 1230000 2022-06-18 -33.89771 151.1969 1077.5788
## 6 2425000 2022-05-06 -33.90043 151.1947 967.6896
## distance_to_train_station(km) distance_class
## 1 0.9676896 (0.75,1]
## 2 0.9676896 (0.75,1]
## 3 1.3259023 (1.25,1.5]
## 4 0.6693040 (0.5,0.75]
## 5 1.0775788 (1,1.25]
## 6 0.9676896 (0.75,1]
tail(combined_df)
## House_ID address bedroom bathroom
## 29781 810 7/24 Methven Street Mount Druitt 2770 3 1
## 29782 811 9/41 Methven Street Mount Druitt 2770 3 1
## 29783 812 1/21 Hythe Street Mount Druitt 2770 3 1
## 29784 813 1/14 Meacher Street Mount Druitt 2770 3 1
## 29785 814 4/34 Durham Street Mount Druitt 2770 3 1
## 29786 815 49/334 Woodstock Avenue Mount Druitt 2770 3 NA
## carspace soldprice yearsold latitude longitude
## 29781 1 154000 2001-09-01 -33.76247 150.8216
## 29782 1 156000 2001-09-01 -33.76186 150.8249
## 29783 1 400000 2001-08-01 -33.76308 150.8211
## 29784 1 189000 2001-08-01 -33.76042 150.8203
## 29785 1 178000 2001-07-01 -33.77166 150.8122
## 29786 NA 124000 2001-06-01 -33.75742 150.8206
## distance_to_train_station distance_to_train_station(km) distance_class
## 29781 802.9798 0.8029798 (0.75,1]
## 29782 967.3295 0.9673295 (0.75,1]
## 29783 729.2326 0.7292326 (0.5,0.75]
## 29784 1019.4336 1.0194336 (1,1.25]
## 29785 768.9916 0.7689916 (0.75,1]
## 29786 1354.9054 1.3549054 (1.25,1.5]
Filtering Data
# One subset of combined_df per bedroom count (1 to 5).
combined_df_1bed <- combined_df %>% filter(bedroom == 1)
combined_df_2bed <- combined_df %>% filter(bedroom == 2)
combined_df_3bed <- combined_df %>% filter(bedroom == 3)
combined_df_4bed <- combined_df %>% filter(bedroom == 4)
combined_df_5bed <- combined_df %>% filter(bedroom == 5)
# NOTE(review): par() configures base graphics; presumably it has no effect on
# the ggplot() figures that follow - confirm before relying on a 1x2 layout.
par(mfrow = c(1, 2))
# 1-bedroom sales: sold price (in units of $100,000) per distance band.
ggplot(data = combined_df_1bed, mapping = aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 1 Bedroom",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

# Same plot, with the boxes split and filled by number of carspaces.
ggplot(data = combined_df_1bed, mapping = aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(
    mapping = aes(fill = factor(carspace)),
    outlier.colour = "blue",
    outlier.size = 1.5
  ) +
  labs(
    title = "Sold Price vs Distance from Train Station for 1 Bedroom",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

# 2-bedroom sales: sold price per distance band.
ggplot(data = combined_df_2bed, mapping = aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 2 Bedrooms",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

summary(combined_df_2bed[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.000e+04 3.500e+05 4.680e+05 7.890e+05 6.120e+05 2.147e+09
# 2-bedroom sales per distance band, boxes filled by number of carspaces.
ggplot(data = combined_df_2bed, mapping = aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(
    mapping = aes(fill = factor(carspace)),
    outlier.colour = "blue",
    outlier.size = 1.5
  ) +
  labs(
    title = "Sold Price vs Distance from Train Station for 2 Bedrooms",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

summary(combined_df_2bed[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.000e+04 3.500e+05 4.680e+05 7.890e+05 6.120e+05 2.147e+09
# 3-bedroom sales: sold price (in units of $100,000) per distance band.
ggplot(data = combined_df_3bed, mapping = aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 3 Bedrooms",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

summary(combined_df_3bed[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 425000 570000 642811 745000 22867454
# 3-bedroom sales per distance band, boxes filled by number of carspaces.
ggplot(data = combined_df_3bed, mapping = aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(
    mapping = aes(fill = factor(carspace)),
    outlier.colour = "blue",
    outlier.size = 1.5
  ) +
  labs(
    title = "Sold Price vs Distance from Train Station for 3 Bedrooms",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

summary(combined_df_3bed[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 425000 570000 642811 745000 22867454
# 4-bedroom sales: sold price (in units of $100,000) per distance band.
ggplot(data = combined_df_4bed, mapping = aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 4 Bedrooms",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

summary(combined_df_4bed[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1650 534300 675000 763610 866000 15000000
# 4-bedroom sales per distance band, boxes filled by number of carspaces.
ggplot(data = combined_df_4bed, mapping = aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(
    mapping = aes(fill = factor(carspace)),
    outlier.colour = "blue",
    outlier.size = 1.5
  ) +
  labs(
    title = "Sold Price vs Distance from Train Station for 4 Bedrooms",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

summary(combined_df_4bed[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1650 534300 675000 763610 866000 15000000
# 5-bedroom sales: sold price (in units of $100,000) per distance band.
ggplot(data = combined_df_5bed, mapping = aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 5 Bedrooms",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

summary(combined_df_5bed[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 200000 659990 783000 864065 930000 3080000
# 5-bedroom sales per distance band, boxes filled by number of carspaces.
ggplot(data = combined_df_5bed, mapping = aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(
    mapping = aes(fill = factor(carspace)),
    outlier.colour = "blue",
    outlier.size = 1.5
  ) +
  labs(
    title = "Sold Price vs Distance from Train Station for 5 Bedrooms",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

summary(combined_df_5bed[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 200000 659990 783000 864065 930000 3080000
Filtering Data by Carspaces and Bedrooms
# One subset of combined_df per (bedroom count, carspace count) pair that is
# plotted below.
combined_df_1bed_1car <- combined_df %>% filter(bedroom == 1, carspace == 1)
combined_df_2bed_1car <- combined_df %>% filter(bedroom == 2, carspace == 1)
combined_df_2bed_2car <- combined_df %>% filter(bedroom == 2, carspace == 2)
combined_df_3bed_1car <- combined_df %>% filter(bedroom == 3, carspace == 1)
combined_df_3bed_2car <- combined_df %>% filter(bedroom == 3, carspace == 2)
combined_df_3bed_3car <- combined_df %>% filter(bedroom == 3, carspace == 3)
combined_df_3bed_4car <- combined_df %>% filter(bedroom == 3, carspace == 4)
combined_df_4bed_1car <- combined_df %>% filter(bedroom == 4, carspace == 1)
combined_df_4bed_2car <- combined_df %>% filter(bedroom == 4, carspace == 2)
combined_df_4bed_3car <- combined_df %>% filter(bedroom == 4, carspace == 3)
combined_df_4bed_4car <- combined_df %>% filter(bedroom == 4, carspace == 4)
combined_df_5bed_1car <- combined_df %>% filter(bedroom == 5, carspace == 1)
combined_df_5bed_2car <- combined_df %>% filter(bedroom == 5, carspace == 2)
combined_df_5bed_3car <- combined_df %>% filter(bedroom == 5, carspace == 3)
1 bedroom
# 1 bedroom, 1 carspace: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_1bed_1car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 1 Bedroom and 1 Carspace",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_1bed_1car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 61325 253000 375000 402900 487225 1330000
2 bedrooms
# 2 bedrooms, 1 carspace: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_2bed_1car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 2 Bedrooms and 1 Carspace",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_2bed_1car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.700e+04 3.400e+05 4.600e+05 8.584e+05 6.000e+05 2.147e+09
# 2 bedrooms, 2 carspaces: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_2bed_2car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 2 Bedrooms and 2 Carspaces",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_2bed_2car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 66000 432625 547500 588482 700000 1581000
3 bedrooms
# 3 bedrooms, 1 carspace: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_3bed_1car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 3 Bedrooms and 1 Carspace",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_3bed_1car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50000 360000 500000 549951 655000 5850000
# 3 bedrooms, 2 carspaces: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_3bed_2car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 3 Bedrooms and 2 Carspaces",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_3bed_2car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 500000 637000 723621 840000 22867454
# 3 bedrooms, 3 carspaces: sold price (in units of $100,000) per distance band.
ggplot(combined_df_3bed_3car, aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 3 Bedrooms and 3 Carspaces",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Summarise the subset that was just plotted (3 carspaces, not the 2-carspace
# subset).
summary(combined_df_3bed_3car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 500000 637000 723621 840000 22867454
# 3 bedrooms, 4 carspaces: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_3bed_4car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 3 Bedrooms and 4 Carspaces",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_3bed_4car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 305000 478500 590000 668278 809000 1600000
4 bedrooms
# 4 bedrooms, 1 carspace: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_4bed_1car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 4 Bedrooms and 1 Carspace",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_4bed_1car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 175000 450000 620000 661696 770000 2750000
# 4 bedrooms, 2 carspaces: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_4bed_2car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 4 Bedrooms and 2 Carspaces",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_4bed_2car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1650 555000 690000 790771 890000 15000000
# 4 bedrooms, 3 carspaces: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_4bed_3car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 4 Bedrooms and 3 Carspaces",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_4bed_3car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100000 615250 769000 927646 1023250 3000000
# 4 bedrooms, 4 carspaces: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_4bed_4car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 4 Bedrooms and 4 Carspaces",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_4bed_4car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 330000 535500 655000 715113 840000 1630000
5 bedrooms
# 5 bedrooms, 1 carspace: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_5bed_1car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 5 Bedrooms and 1 Carspace",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_5bed_1car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 335000 602500 722500 743607 870000 1850000
# 5 bedrooms, 2 carspaces: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_5bed_2car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 5 Bedrooms and 2 Carspaces",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_5bed_2car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 200000 663748 820000 900051 960416 3080000
# 5 bedrooms, 3 carspaces: sold price (in units of $100,000) per distance band.
ggplot(
  data = combined_df_5bed_3car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs Distance from Train Station for 5 Bedrooms and 3 Carspaces",
    x = "Distance from Train Station(km)",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_5bed_3car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 700000 738000 790000 787833 838250 910000
Creating a column for Year
# Calendar year of each sale (taken from yearsold), stored as a factor.
combined_df[["Year"]] <- as.factor(format(as.Date(combined_df[["yearsold"]]), "%Y"))
# One subset per distance band. The strings must match the level labels of
# distance_class exactly; a label that is not a level yields an empty subset.
combined_df_0.00 <- combined_df %>% filter(distance_class == "(0,0.25]")
combined_df_0.25 <- combined_df %>% filter(distance_class == "(0.25,0.5]")
combined_df_0.50 <- combined_df %>% filter(distance_class == "(0.5,0.75]")
combined_df_0.75 <- combined_df %>% filter(distance_class == "(0.75,1]")
combined_df_1.00 <- combined_df %>% filter(distance_class == "(1,1.25]")
combined_df_1.25 <- combined_df %>% filter(distance_class == "(1.25,1.5]")
combined_df_1.50 <- combined_df %>% filter(distance_class == "(1.5,1.75]")
combined_df_1.75 <- combined_df %>% filter(distance_class == "(1.75,2]")
combined_df_2.00 <- combined_df %>% filter(distance_class == "(2,2.25]")
combined_df_2.25 <- combined_df %>% filter(distance_class == "(2.25,2.5]")
combined_df_2.50 <- combined_df %>% filter(distance_class == "(2.5,2.75]")
combined_df_2.75 <- combined_df %>% filter(distance_class == "(2.75,3]")
combined_df_3.00 <- combined_df %>% filter(distance_class == "(3,3.25]")
combined_df_3.25 <- combined_df %>% filter(distance_class == "(3.25,3.5]")
combined_df_3.50 <- combined_df %>% filter(distance_class == "(3.5,3.75]")
combined_df_3.75 <- combined_df %>% filter(distance_class == "(3.75,4]")
# Sold price (in units of $100,000) per year for the (0,0.25] distance band.
ggplot(data = combined_df_0.00, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 0 to 0.25km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_0.00[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50000 415000 565000 655697 780000 3300000
# Sold price (in units of $100,000) per year for the (0.25,0.5] distance band.
ggplot(data = combined_df_0.25, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 0.25 to 0.50km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_0.25[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.000e+04 4.500e+05 5.975e+05 1.106e+06 7.910e+05 2.147e+09
# Sold price (in units of $100,000) per year for the (0.5,0.75] distance band.
ggplot(data = combined_df_0.50, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 0.50 to 0.75km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_0.50[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1650 425000 582500 652451 775000 15000000
# Sold price (in units of $100,000) per year for the (0.75,1] distance band.
ggplot(data = combined_df_0.75, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 0.75 to 1.00km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_0.75[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 60000 420000 549475 612502 718375 6203000
# Sold price (in units of $100,000) per year for the (1,1.25] distance band.
ggplot(data = combined_df_1.00, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 1.00 to 1.25km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_1.00[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 392500 540000 606991 720000 4400000
# Sold price (in units of $100,000) per year for the (1.25,1.5] distance band.
ggplot(data = combined_df_1.25, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 1.25 to 1.50km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_1.25[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 61325 378000 524000 572350 680000 4840000
# Sold price (in units of $100,000) per year for the (1.5,1.75] distance band.
ggplot(data = combined_df_1.50, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 1.50 to 1.75km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_1.50[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 92000 374462 522250 563676 650000 2812000
# Sold price (in units of $100,000) per year for the (1.75,2] distance band.
ggplot(data = combined_df_1.75, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 1.75 to 2.00km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_1.75[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100000 359250 509500 566946 680000 3100000
# Sold price (in units of $100,000) per year for the (2,2.25] distance band.
ggplot(data = combined_df_2.00, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 2.00 to 2.25km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_2.00[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 132500 370000 490275 529213 620000 2430000
# Sold price (in units of $100,000) per year for the (2.25,2.5] distance band.
ggplot(data = combined_df_2.25, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 2.25 to 2.50km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_2.25[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 190000 382500 550000 592289 687000 5346000
# Sold price (in units of $100,000) per year for the (2.5,2.75] distance band.
# NOTE(review): the recorded summary for this subset is empty - check that
# "(2.5,2.75]" is actually one of the distance_class levels.
ggplot(data = combined_df_2.50, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 2.50 to 2.75km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_2.50[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##
# Sold price (in units of $100,000) per year for the (2.75,3] distance band.
# NOTE(review): the recorded summary for this subset is empty - check that
# "(2.75,3]" is actually one of the distance_class levels.
ggplot(data = combined_df_2.75, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 2.75 to 3.00km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_2.75[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##
# Sold price (in units of $100,000) per year for the (3,3.25] distance band.
ggplot(data = combined_df_3.00, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 3.00 to 3.25km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_3.00[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 180000 357500 500101 538112 600000 1777000
# Sold price (in units of $100,000) per year for the (3.25,3.5] distance band.
ggplot(combined_df_3.25, aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    # The band ends at 3.50 km; the 3.50 to 3.75 km band has its own plot.
    title = "Sold Price vs year for townhouses 3.25 to 3.50km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_3.25$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 225000 300000 460000 446274 543000 1125000
# Sold price (in units of $100,000) per year for the (3.5,3.75] distance band.
ggplot(data = combined_df_3.50, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 3.50 to 3.75km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_3.50[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250000 300000 355000 386685 441000 664000
# Sold price (in units of $100,000) per year for the (3.75,4] distance band.
ggplot(data = combined_df_3.75, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price vs year for townhouses 3.75 to 4.00km from train station",
    x = "Year",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(combined_df_3.75[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##
# All sales: sold price (in units of $100,000) per year, points coloured by
# distance band.
ggplot(data = combined_df, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_point(mapping = aes(color = distance_class)) +
  labs(
    title = "Sold Price over Years",
    x = "Year",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

# All sales: sold price per year as box plots.
ggplot(data = combined_df, mapping = aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price over Years",
    x = "Year",
    y = "Selling Price (x$100000)",
    fill = "Number of Carspaces"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

# All sales: sold price by number of bedrooms.
ggplot(data = combined_df, mapping = aes(x = factor(bedroom), y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price for Different Numbers of Bedrooms",
    x = "Number of Bedrooms",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

# All sales: sold price by number of bathrooms.
ggplot(data = combined_df, mapping = aes(x = factor(bathroom), y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price for Different Numbers of Bathrooms",
    x = "Number of Bathrooms",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )

# All sales: sold price by number of carspaces.
ggplot(data = combined_df, mapping = aes(x = factor(carspace), y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(
    title = "Sold Price for Different Numbers of Carspaces",
    x = "Number of Carspaces",
    y = "Selling Price (x$100000)"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.25)
  )
